From 3df656413096ce00b7b827626190a7907f1b688a Mon Sep 17 00:00:00 2001 From: "iap10@freefall.cl.cam.ac.uk" Date: Sun, 13 Feb 2005 20:31:37 +0000 Subject: [PATCH] bitkeeper revision 1.1194.1.1 (420fb929A6i2BgwaqAFiDYqZMrpIQg) Re-organise guest_table, shadow_table and monitor_table so that they always have the meaning their names suggest i.e. in the hypervisor CR3 always contains monitor_table. After updating guest_table or any of the shadow state remember to call update_pagetables(ed). One side-effect of this change is that VMX guests now start off with shadow_mode set to full_32, but actually running on the 1:1 physmap. We don't actually call update_pagetables() until the VCPU enables paging, hence ensuring that linear_pg_table is NULL so we bail out early in shadow_fault if a vmx guest tries to access outside its memory map. We'll need this for SMP VMX guests so that each VCPU can enable paging independently. We might need to think further about this for guests that do IO without paging on - possibly having a generated pseudo phys pt that the full shadow mode can translate and shadow. 
Signed-off-by: ian@xensource.com --- xen/arch/x86/domain.c | 31 ++++++++++++++++----- xen/arch/x86/mm.c | 26 ++---------------- xen/arch/x86/shadow.c | 43 ++++++++++++++++++++++-------- xen/arch/x86/vmx.c | 28 ++++++++++++------- xen/arch/x86/vmx_vmcs.c | 1 - xen/arch/x86/x86_32/domain_build.c | 6 +++-- xen/arch/x86/x86_64/domain_build.c | 5 +++- xen/common/domain.c | 3 --- xen/include/asm-x86/domain.h | 2 +- xen/include/asm-x86/shadow.h | 34 ++++++++++++++++------- 10 files changed, 109 insertions(+), 70 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index ed987e1503..65753330d9 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -259,6 +259,8 @@ void arch_do_createdomain(struct exec_domain *ed) d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] = mk_l3_pgentry(__pa(d->arch.mm_perdomain_l2) | __PAGE_HYPERVISOR); #endif + + shadow_lock_init(d); } } @@ -290,13 +292,15 @@ void arch_vmx_do_launch(struct exec_domain *ed) reset_stack_and_jump(vmx_asm_do_launch); } -static void monitor_mk_pagetable(struct exec_domain *ed) +static void alloc_monitor_pagetable(struct exec_domain *ed) { unsigned long mpfn; l2_pgentry_t *mpl2e, *phys_table; struct pfn_info *mpfn_info; struct domain *d = ed->domain; + ASSERT(!ed->arch.monitor_table); /* we should only get called once */ + mpfn_info = alloc_domheap_page(NULL); ASSERT( mpfn_info ); @@ -309,7 +313,6 @@ static void monitor_mk_pagetable(struct exec_domain *ed) HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); ed->arch.monitor_table = mk_pagetable(mpfn << PAGE_SHIFT); - d->arch.shadow_mode = SHM_full_32; mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) @@ -327,7 +330,7 @@ static void monitor_mk_pagetable(struct exec_domain *ed) /* * Free the pages for monitor_table and guest_pl2e_cache */ -static void monitor_rm_pagetable(struct exec_domain *ed) +static void free_monitor_pagetable(struct exec_domain *ed) { 
l2_pgentry_t *mpl2e; unsigned long mpfn; @@ -382,7 +385,6 @@ static int vmx_final_setup_guest(struct exec_domain *ed, goto out; } - monitor_mk_pagetable(ed); ed->arch.schedule_tail = arch_vmx_do_launch; clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state); @@ -394,12 +396,20 @@ static int vmx_final_setup_guest(struct exec_domain *ed, if (ed == ed->domain->exec_domain[0]) { /* * Required to do this once per domain + * XXX todo: add a seperate function to do these. */ memset(&ed->domain->shared_info->evtchn_mask[0], 0xff, sizeof(ed->domain->shared_info->evtchn_mask)); clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]); + + /* Put the domain in shadow mode even though we're going to be using + * the shared 1:1 page table initially. It shouldn't hurt */ + shadow_mode_enable(ed->domain, SHM_full_32); } + update_pagetables(ed); /* this assigns shadow_pagetable */ + alloc_monitor_pagetable(ed); /* this assigns monitor_pagetable */ + return 0; out: @@ -409,6 +419,8 @@ out: } #endif + +/* This is called by arch_final_setup_guest and do_boot_vcpu */ int arch_final_setup_guest( struct exec_domain *d, full_execution_context_t *c) { @@ -467,8 +479,8 @@ int arch_final_setup_guest( d->arch.failsafe_address = c->failsafe_callback_eip; phys_basetab = c->pt_base; - d->arch.guest_table = mk_pagetable(phys_basetab); - d->arch.phys_table = d->arch.guest_table; + d->arch.guest_table = d->arch.phys_table = mk_pagetable(phys_basetab); + if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain, PGT_base_page_table) ) return -EINVAL; @@ -490,6 +502,9 @@ int arch_final_setup_guest( return vmx_final_setup_guest(d, c); #endif + update_pagetables(d); /* this assigns shadow_pagetable + and monitor_table */ + return 0; } @@ -639,6 +654,7 @@ static void switch_segments( { n->arch.flags |= TF_kernel_mode; __asm__ __volatile__ ( "swapgs" ); + update_pagetables(ed); write_ptbase(n); } @@ -663,6 +679,7 @@ long do_switch_to_user(void) ed->arch.flags &= 
~TF_kernel_mode; __asm__ __volatile__ ( "swapgs" ); + update_pagetables(ed); write_ptbase(ed); regs->rip = stu.rip; @@ -929,7 +946,7 @@ static void vmx_domain_relinquish_memory(struct exec_domain *ed) free_vmcs(ed->arch.arch_vmx.vmcs); ed->arch.arch_vmx.vmcs = 0; - monitor_rm_pagetable(ed); + free_monitor_pagetable(ed); rem_ac_timer(&(vpit->pit_timer)); } #endif diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 09819b958c..4a53fb37d2 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -196,28 +196,7 @@ void arch_init_memory(void) void write_ptbase(struct exec_domain *ed) { - struct domain *d = ed->domain; - unsigned long pa; - -#ifdef CONFIG_VMX - if ( unlikely(shadow_mode(d)) ) - pa = ((shadow_mode(d) == SHM_full_32) ? - pagetable_val(ed->arch.monitor_table) : - pagetable_val(ed->arch.shadow_table)); - else - pa = pagetable_val(ed->arch.guest_table); -#else - if ( unlikely(shadow_mode(d)) ) - pa = pagetable_val(ed->arch.shadow_table); -#ifdef __x86_64__ - else if ( !(ed->arch.flags & TF_kernel_mode) ) - pa = pagetable_val(ed->arch.guest_table_user); -#endif - else - pa = pagetable_val(ed->arch.guest_table); -#endif - - write_cr3(pa); + write_cr3(pagetable_val(ed->arch.monitor_table)); } static void __invalidate_shadow_ldt(struct exec_domain *d) @@ -1251,8 +1230,7 @@ int new_guest_cr3(unsigned long pfn) percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; old_base_pfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; ed->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT); - - shadow_mk_pagetable(ed); + update_pagetables(ed); /* update shadow_table and monitor_table */ write_ptbase(ed); diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 0611c83797..6e509518fd 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -45,7 +45,7 @@ static inline void free_shadow_page( free_domheap_page(page); } -static void free_shadow_state(struct domain *d) +void free_shadow_state(struct domain *d) { int i, free = 0; struct shadow_status *x, *n; @@ 
-166,15 +166,20 @@ void shadow_mode_init(void) { } -int shadow_mode_enable(struct domain *d, unsigned int mode) + +int __shadow_mode_enable(struct domain *d, unsigned int mode) { - d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); - if ( d->arch.shadow_ht == NULL ) - goto nomem; - memset(d->arch.shadow_ht, 0, + if (!d->arch.shadow_ht) + { + d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); + if ( d->arch.shadow_ht == NULL ) + goto nomem; + + memset(d->arch.shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status)); + } - if ( mode == SHM_logdirty ) + if ( mode == SHM_logdirty && !d->arch.shadow_dirty_bitmap) { d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63; d->arch.shadow_dirty_bitmap = @@ -191,7 +196,6 @@ int shadow_mode_enable(struct domain *d, unsigned int mode) d->arch.shadow_mode = mode; - __shadow_mk_pagetable(d->exec_domain[0]); /* XXX SMP */ return 0; nomem: @@ -201,6 +205,15 @@ int shadow_mode_enable(struct domain *d, unsigned int mode) return -ENOMEM; } +int shadow_mode_enable(struct domain *d, unsigned int mode) +{ + int rc; + shadow_lock(d); + rc = __shadow_mode_enable(d, mode); + shadow_unlock(d); + return rc; +} + void __shadow_mode_disable(struct domain *d) { struct shadow_status *x, *n; @@ -240,6 +253,7 @@ static int shadow_mode_table_op( { unsigned int op = sc->op; int i, rc = 0; + struct exec_domain *ed; ASSERT(spin_is_locked(&d->arch.shadow_lock)); @@ -344,7 +358,10 @@ static int shadow_mode_table_op( SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count); shadow_audit(d, 1); - __shadow_mk_pagetable(d->exec_domain[0]); /* XXX SMP */ + + for_each_exec_domain(d,ed) + __update_pagetables(ed); + return rc; } @@ -352,6 +369,7 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) { unsigned int op = sc->op; int rc = 0; + struct exec_domain *ed; if ( unlikely(d == current->domain) ) { @@ -372,12 +390,12 @@ int shadow_mode_control(struct domain 
*d, dom0_shadow_control_t *sc) case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: shadow_mode_disable(d); - rc = shadow_mode_enable(d, SHM_test); + rc = __shadow_mode_enable(d, SHM_test); break; case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: shadow_mode_disable(d); - rc = shadow_mode_enable(d, SHM_logdirty); + rc = __shadow_mode_enable(d, SHM_logdirty); break; default: @@ -387,6 +405,9 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) shadow_unlock(d); + for_each_exec_domain(d,ed) + update_pagetables(ed); + domain_unpause(d); return rc; diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c index f3bd45be7f..dd53d5dbab 100644 --- a/xen/arch/x86/vmx.c +++ b/xen/arch/x86/vmx.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #ifdef CONFIG_VMX @@ -420,24 +421,31 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) domain_crash(); /* need to take a clean path */ } old_base_pfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT; + + /* We know that none of the previous 1:1 shadow pages are + * going to be used again, so might as well flush them. + * XXXX wait until the last VCPU boots before doing the flush !! + */ + shadow_lock(d->domain); + free_shadow_state(d->domain); // XXX SMP + shadow_unlock(d->domain); + /* * Now arch.guest_table points to machine physical. 
*/ d->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT); + update_pagetables(d); VMX_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx\n", (unsigned long) (pfn << PAGE_SHIFT)); - shadow_lock(d->domain); - shadow_mode_enable(d->domain, SHM_full_32); - shadow_unlock(d->domain); - __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table)); /* - * mm->shadow_table should hold the next CR3 for shadow + * arch->shadow_table should hold the next CR3 for shadow */ VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, pfn = %lx\n", d->arch.arch_vmx.cpu_cr3, pfn); + /* undo the get_page done in the para virt case */ put_page_and_type(&frame_table[old_base_pfn]); } @@ -448,11 +456,11 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) unsigned long pfn; /* - * If paging is not enabled yet, simply copy the valut to CR3. + * If paging is not enabled yet, simply copy the value to CR3. */ if (!test_bit(VMX_CPU_STATE_PG_ENABLED, &d->arch.arch_vmx.cpu_state)) { d->arch.arch_vmx.cpu_cr3 = value; - return; + break; } guest_pl2e_cache_invalidate(d); @@ -484,10 +492,10 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) } pfn = phys_to_machine_mapping(value >> PAGE_SHIFT); vmx_shadow_clear_state(d->domain); - d->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT); - shadow_mk_pagetable(d); + d->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT); + update_pagetables(d); /* - * mm->shadow_table should hold the next CR3 for shadow + * arch.shadow_table should now hold the next CR3 for shadow */ d->arch.arch_vmx.cpu_cr3 = value; VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx\n", diff --git a/xen/arch/x86/vmx_vmcs.c b/xen/arch/x86/vmx_vmcs.c index da68139dab..c4f7daec44 100644 --- a/xen/arch/x86/vmx_vmcs.c +++ b/xen/arch/x86/vmx_vmcs.c @@ -219,7 +219,6 @@ void vmx_do_launch(struct exec_domain *ed) error |= __vmwrite(GUEST_TR_BASE, 0); error |= __vmwrite(GUEST_TR_LIMIT, 0xff); - ed->arch.shadow_table = ed->arch.guest_table; __vmwrite(GUEST_CR3, 
pagetable_val(ed->arch.guest_table)); __vmwrite(HOST_CR3, pagetable_val(ed->arch.monitor_table)); __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom()); diff --git a/xen/arch/x86/x86_32/domain_build.c b/xen/arch/x86/x86_32/domain_build.c index 1147a3a66a..b0854f15a9 100644 --- a/xen/arch/x86/x86_32/domain_build.c +++ b/xen/arch/x86/x86_32/domain_build.c @@ -307,6 +307,9 @@ int construct_dom0(struct domain *d, d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; d->shared_info->n_vcpu = smp_num_cpus; + /* setup shadow and monitor tables */ + update_pagetables(ed); + /* Install the new page tables. */ __cli(); write_ptbase(ed); @@ -381,9 +384,8 @@ int construct_dom0(struct domain *d, #ifndef NDEBUG if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ { - shadow_lock(d); shadow_mode_enable(d, SHM_test); - shadow_unlock(d); + update_pagetables(ed); /* XXX SMP */ } #endif diff --git a/xen/arch/x86/x86_64/domain_build.c b/xen/arch/x86/x86_64/domain_build.c index dfa9769980..1ae5604a1a 100644 --- a/xen/arch/x86/x86_64/domain_build.c +++ b/xen/arch/x86/x86_64/domain_build.c @@ -1,4 +1,4 @@ -/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ /****************************************************************************** * domain_build.c * @@ -328,6 +328,9 @@ int construct_dom0(struct domain *d, d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; d->shared_info->n_vcpu = smp_num_cpus; + /* setup shadow and monitor tables */ + update_pagetables(ed); + /* Install the new page tables. 
*/ __cli(); write_ptbase(ed); diff --git a/xen/common/domain.c b/xen/common/domain.c index 081c68820f..8cb023ad52 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -40,8 +40,6 @@ struct domain *do_createdomain(domid_t dom_id, unsigned int cpu) atomic_set(&d->refcnt, 1); atomic_set(&ed->pausecnt, 0); - shadow_lock_init(d); - d->id = dom_id; ed->processor = cpu; d->create_time = NOW(); @@ -330,7 +328,6 @@ long do_boot_vcpu(unsigned long vcpu, full_execution_context_t *ctxt) ed = d->exec_domain[vcpu]; atomic_set(&ed->pausecnt, 0); - shadow_lock_init(d); memcpy(&ed->arch, &idle0_exec_domain.arch, sizeof(ed->arch)); diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 70c4d1b1f3..62d07029b3 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -124,7 +124,7 @@ struct arch_exec_domain #define IDLE0_ARCH_EXEC_DOMAIN \ { \ perdomain_ptes: 0, \ - guest_table: mk_pagetable(__pa(idle_pg_table)) \ + monitor_table: mk_pagetable(__pa(idle_pg_table)) \ } #endif /* __ASM_DOMAIN_H__ */ diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index 40fab94af9..86b6fd09e9 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -41,6 +41,7 @@ extern void shadow_l1_normal_pt_update( extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde); extern void unshadow_table(unsigned long gpfn, unsigned int type); extern int shadow_mode_enable(struct domain *p, unsigned int mode); +extern void free_shadow_state(struct domain *d); #ifdef CONFIG_VMX extern void vmx_shadow_clear_state(struct domain *); @@ -723,43 +724,56 @@ static inline unsigned long gva_to_gpa(unsigned long gva) #endif /* CONFIG_VMX */ -static inline void __shadow_mk_pagetable(struct exec_domain *ed) +static inline void __update_pagetables(struct exec_domain *ed) { struct domain *d = ed->domain; unsigned long gpfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; unsigned long smfn = __shadow_status(d, 
gpfn) & PSH_pfn_mask; - SH_VVLOG("0: __shadow_mk_pagetable(gpfn=%p, smfn=%p)", gpfn, smfn); + SH_VVLOG("0: __update_pagetables(gpfn=%p, smfn=%p)", gpfn, smfn); if ( unlikely(smfn == 0) ) smfn = shadow_l2_table(d, gpfn); #ifdef CONFIG_VMX else if (d->arch.shadow_mode == SHM_full_32) + { vmx_update_shadow_state(ed, gpfn, smfn); + } #endif ed->arch.shadow_table = mk_pagetable(smfn<arch.shadow_mode != SHM_full_32) + ed->arch.monitor_table = ed->arch.shadow_table; } -static inline void shadow_mk_pagetable(struct exec_domain *ed) +static inline void update_pagetables(struct exec_domain *ed) { if ( unlikely(shadow_mode(ed->domain)) ) { - SH_VVLOG("shadow_mk_pagetable( gptbase=%p, mode=%d )", + SH_VVLOG("update_pagetables( gptbase=%p, mode=%d )", pagetable_val(ed->arch.guest_table), shadow_mode(ed->domain)); shadow_lock(ed->domain); - __shadow_mk_pagetable(ed); + __update_pagetables(ed); shadow_unlock(ed->domain); - SH_VVLOG("leaving shadow_mk_pagetable:\n" - "( gptbase=%p, mode=%d ) sh=%p", - pagetable_val(ed->arch.guest_table), - shadow_mode(ed->domain), - pagetable_val(ed->arch.shadow_table) ); + SH_VVLOG("leaving update_pagetables:\n" + "( gptbase=%p, mode=%d ) sh=%p", + pagetable_val(ed->arch.guest_table), + shadow_mode(ed->domain), + pagetable_val(ed->arch.shadow_table) ); } + else +#ifdef __x86_64__ + if ( !(ed->arch.flags & TF_kernel_mode) ) + ed->arch.monitor_table = ed->arch.guest_table_user; + else +#endif + ed->arch.monitor_table = ed->arch.guest_table; + } #if SHADOW_DEBUG -- 2.30.2